Loading Library

library(RColorBrewer)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.0
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(ggplot2)
library(readr)
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(tidyverse)
library(stringr)
library(dplyr)
library(formattable)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:formattable':
## 
##     style
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(readr)
library(htmltools)
library(htmlwidgets)
bin001 <- read_csv("bin001_topfiftyhits2.txt", 
                   col_names = c("qseqid", "sseqid", "evalue", "staxids", "sscinames", "sskingdoms", "stitle"))
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 5847 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): qseqid, sseqid, staxids, sscinames, sskingdoms, stitle
## dbl (1): evalue
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
bin001_eval <- bin001 %>% 
  filter(as.numeric(evalue) <= 1e-10)
bin001_kingdomcount <- bin001_eval %>% 
  count(sskingdoms)
bin001_kingdomcount
## # A tibble: 7 × 2
##   sskingdoms           n
##   <chr>            <int>
## 1 Archaea             81
## 2 Archaea;Bacteria     1
## 3 Bacteria           705
## 4 Bacteria;Viruses     1
## 5 Eukaryota          710
## 6 N/A                  1
## 7 Viruses            506
piechart_bin001 <- ggplot(bin001_kingdomcount, aes(x="", y=n, fill=sskingdoms)) +
  geom_bar(stat="identity", width=1) +
  coord_polar("y", start=0) +
  scale_fill_brewer(palette = "PiYG") +
   theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        panel.grid  = element_blank())
piechart_bin001

bin001_rmduplicates <- bin001_eval %>% 
  group_by(qseqid) %>% 
  distinct(sseqid, .keep_all = TRUE)
bin001_eukaryotes <- bin001_rmduplicates %>%
  filter(sskingdoms == "Eukaryota") %>%
  group_by(qseqid, sseqid, staxids) %>%
  slice_min(order_by = evalue) %>%
  ungroup() %>% 
  mutate(genus = word(sscinames, 1))
bin001_eukaryotes
## # A tibble: 708 × 8
##    qseqid             sseqid         evalue staxids sscin…¹ sskin…² stitle genus
##    <chr>              <chr>           <dbl> <chr>   <chr>   <chr>   <chr>  <chr>
##  1 Contig156336240_22 dbj|BAB840… 2.39e- 99 41067   Asperg… Eukary… DNA t… Aspe…
##  2 Contig156336240_22 dbj|GBB894… 1.37e-102 94130   Rhizop… Eukary… hypot… Rhiz…
##  3 Contig156336240_22 dbj|GES916… 6.5 e-103 94130   Rhizop… Eukary… DNA t… Rhiz…
##  4 Contig156336240_22 dbj|GES916… 4.62e-101 94130   Rhizop… Eukary… DNA t… Rhiz…
##  5 Contig156336240_22 dbj|GHP123… 2.36e-100 41880   Pycnoc… Eukary… hypot… Pycn…
##  6 Contig156336240_22 emb|CAA278… 4.11e-101 4896    Schizo… Eukary… unnam… Schi…
##  7 Contig156336240_22 emb|CAB438… 9.78e-102 588596  Rhizop… Eukary… unnam… Rhiz…
##  8 Contig156336240_22 emb|CAB442… 7.61e-102 588596  Rhizop… Eukary… unnam… Rhiz…
##  9 Contig156336240_22 emb|CAB448… 6.41e-102 588596  Rhizop… Eukary… unnam… Rhiz…
## 10 Contig156336240_22 emb|CAB518… 1.25e-101 588596  Rhizop… Eukary… unnam… Rhiz…
## # … with 698 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms
# Count the number of times each genus appears for a specific protein sequence
bin001_egenus_counts <- bin001_eukaryotes %>%
  group_by(qseqid, genus) %>%
  summarise(count = n(), .groups = "keep") %>% 
  filter(count > 1) # Only include those with more than one count 
print(bin001_egenus_counts)
## # A tibble: 118 × 3
## # Groups:   qseqid, genus [118]
##    qseqid             genus               count
##    <chr>              <chr>               <int>
##  1 Contig156336240_22 Paramecium              3
##  2 Contig156336240_22 Pomacea                 4
##  3 Contig156336240_22 Rhizophagus            16
##  4 Contig156336240_22 Schizosaccharomyces     4
##  5 Contig156336240_34 Aspergillus             6
##  6 Contig156336240_34 Beauveria               3
##  7 Contig156336240_34 Candida                 2
##  8 Contig156336240_34 Debaryomyces            2
##  9 Contig156336240_34 Exophiala               4
## 10 Contig156336240_34 Fonsecaea               2
## # … with 108 more rows
# Count the number of times each genus appears for a specific protein sequence
bin001_egenus_counts_3 <- bin001_eukaryotes %>%
  group_by(qseqid, genus) %>%
  summarise(count = n(), .groups = "keep") %>% 
  filter(count > 5) # Only include those with more than one count 
print(bin001_egenus_counts_3)
## # A tibble: 15 × 3
## # Groups:   qseqid, genus [15]
##    qseqid             genus           count
##    <chr>              <chr>           <int>
##  1 Contig156336240_22 Rhizophagus        16
##  2 Contig156336240_34 Aspergillus         6
##  3 Contig156336240_34 Paecilomyces        7
##  4 Contig156336241_2  Leishmania         12
##  5 Contig156336241_2  Trypanosoma         9
##  6 Contig156336241_26 Paramecium          9
##  7 Contig156336241_27 Cryptosporidium    11
##  8 Contig156336241_27 Entamoeba           6
##  9 Contig156336242_25 Colletotrichum     16
## 10 Contig156336242_33 Orbilia            10
## 11 Contig156336242_67 Ceratopteris        7
## 12 Contig156336242_67 Haematococcus       7
## 13 Contig156336243_33 Petromyzon          7
## 14 Contig156336243_33 Podila              8
## 15 Contig156336243_7  Penicillium        16
distinct_colors <- c("skyblue1", "#E6F5D0", "#B8E186", "#7FBC41", "#4D9221", "#276419", "#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33", "#A65628", "#F781BF", "#999999", "#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A", "#FFFF99", "#B15928", "#FBB4AE", "#B3CDE3", "#CCEBC5", "#DECBE4", "#FED9A6", "midnightblue", "#E5D8BD", "cornflowerblue", "#8DD3C7", "royalblue", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "darkcyan", "#D9D9D9", "#BC80BD", "#CCEBC5", "chartreuse2", "#7FC97F", "#BEAED4", "purple", "cadetblue", "#386CB0", "#F0027F", "#BF5B17", "#666666", "#8E0152", "#C51B7D", "#DE77AE", "#F1B6DA", "#FDE0EF", "lightpink", "lightgreen", "mediumpurple1", "lightgoldenrod2", "lavender", "lavenderblush1", "lightblue2", "lightcyan1", "lemonchiffon2", "lawngreen", "khaki3", "ivory2", "indianred2", "hotpink2", "honeydew2", "mistyrose1", "midnightblue", "mediumvioletred", "mediumslateblue", "mediumpurple3", "mediumorchid1", "maroon2", "maroon", "limegreen", "lightsteelblue", "lightskyblue", "lightseagreen", "lightsalmon", "lightpink2", "pink", "peachpuff", "palevioletred1", "paleturquoise", "palegreen", "orchid1", "orange1", "olivedrab1", "purple2", "plum1", "steelblue1", "royalblue1", "skyblue",  "coral1", "chartreuse2", "darkgreen", "deepskyblue", "cornflowerblue", "darkcyan", "moccasin")
bin001_eukaryotes_ggplot <- ggplot(bin001_egenus_counts_3, aes(x = genus, y = count)) +
  geom_bar(stat = "identity", width = 0.7, position = "stack") +
  scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
  theme_minimal() +
  theme(text = element_text(size = 25), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
bin001_eukaryotes_ggplot

bin001_eukaryotes_ggplot <- ggplot(bin001_egenus_counts, aes(x = qseqid, y = count, fill = genus)) +
  geom_bar(stat = "identity", width = 0.7, position = "stack") +
  scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
  scale_fill_manual(values = distinct_colors) +
  theme_minimal() +
  theme(text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
bin001_eukaryotes_ggplot

ggplotly(bin001_eukaryotes_ggplot)